Importing Libraries and Data

In [2]:
# Importing Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 
%matplotlib inline
from sklearn.model_selection import train_test_split, cross_val_score 
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score
import math
from sklearn.metrics import r2_score
from sklearn import metrics
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
In [3]:
# Importing Wine Data
wine_data = pd.read_csv("/Users/vishruta/Downloads/winequality-red.csv",low_memory =False )
wine_data
Out[3]:
fixed acidity volatile acidity citric acid residual sugar chlorides free sulfur dioxide total sulfur dioxide density pH sulphates alcohol quality
0 7.4 0.700 0.00 1.9 0.076 11.0 34.0 0.99780 3.51 0.56 9.4 5
1 7.8 0.880 0.00 2.6 0.098 25.0 67.0 0.99680 3.20 0.68 9.8 5
2 7.8 0.760 0.04 2.3 0.092 15.0 54.0 0.99700 3.26 0.65 9.8 5
3 11.2 0.280 0.56 1.9 0.075 17.0 60.0 0.99800 3.16 0.58 9.8 6
4 7.4 0.700 0.00 1.9 0.076 11.0 34.0 0.99780 3.51 0.56 9.4 5
... ... ... ... ... ... ... ... ... ... ... ... ...
1594 6.2 0.600 0.08 2.0 0.090 32.0 44.0 0.99490 3.45 0.58 10.5 5
1595 5.9 0.550 0.10 2.2 0.062 39.0 51.0 0.99512 3.52 0.76 11.2 6
1596 6.3 0.510 0.13 2.3 0.076 29.0 40.0 0.99574 3.42 0.75 11.0 6
1597 5.9 0.645 0.12 2.0 0.075 32.0 44.0 0.99547 3.57 0.71 10.2 5
1598 6.0 0.310 0.47 3.6 0.067 18.0 42.0 0.99549 3.39 0.66 11.0 6

1599 rows × 12 columns

Exploratory Data Analysis

In [4]:
wine_data.isnull().sum()
Out[4]:
fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 0
dtype: int64
In [5]:
## Quality plot 
sns.catplot(x='quality', data = wine_data, kind='count')
Out[5]:
<seaborn.axisgrid.FacetGrid at 0x106be9d50>
In [6]:
## Correlation plot
plt.figure(figsize = (15,15))
cmap = sns.light_palette((210, 90, 60), input="husl")
#cmap= sns.color_palette("PuBuGn_d")
sns.heatmap(wine_data.corr(), cmap= cmap, annot=True, square=True)
plt.title("Correlation Plot")
Out[6]:
Text(0.5, 1, 'Correlation Plot')
In [7]:
#Correlation with Quality with respect to attributes
wine_data.corrwith(wine_data.quality).plot.bar(
        figsize = (20, 10), title = "Correlation with quality", fontsize = 15,
        rot = 45, grid = True)
Out[7]:
<matplotlib.axes._subplots.AxesSubplot at 0x105fc9290>
In [8]:
# Converting Quality to 0 and 1
plt.figure(figsize=(10,20))
wine_data['quality_new1'] = (wine_data['quality'] >= 5.5)*1
wine_data['quality_new2'] = (wine_data['quality'] > 5)*1
wine_data['quality_new3'] = (wine_data['quality'] >= 6)*1
wine_data['quality_new4'] = (wine_data['quality'] > 6.5)*1

print('For >= 5.5', '\n' , wine_data.quality_new1.value_counts())

print('For > 5',  '\n' ,wine_data.quality_new2.value_counts())

print('For >= 6',  '\n' ,wine_data.quality_new3.value_counts())

print('For > 6.5',  '\n' ,wine_data.quality_new4.value_counts())

#sns.catplot(x='quality_new1', data = wine_data, kind='count')
#wine_data.quality_new1.value_counts()
For >= 5.5 
 1    855
0    744
Name: quality_new1, dtype: int64
For > 5 
 1    855
0    744
Name: quality_new2, dtype: int64
For >= 6 
 1    855
0    744
Name: quality_new3, dtype: int64
For > 6.5 
 0    1382
1     217
Name: quality_new4, dtype: int64
<Figure size 720x1440 with 0 Axes>
We can see that taking the threshold >=6 gives us a comparitively balanced data so we will use quality >=6 as our threshold to convert the quality to 0 and 1.
In [9]:
wine_data = wine_data.drop(['quality_new1','quality_new2','quality_new3','quality_new4'],axis =1)
wine_data
Out[9]:
fixed acidity volatile acidity citric acid residual sugar chlorides free sulfur dioxide total sulfur dioxide density pH sulphates alcohol quality
0 7.4 0.700 0.00 1.9 0.076 11.0 34.0 0.99780 3.51 0.56 9.4 5
1 7.8 0.880 0.00 2.6 0.098 25.0 67.0 0.99680 3.20 0.68 9.8 5
2 7.8 0.760 0.04 2.3 0.092 15.0 54.0 0.99700 3.26 0.65 9.8 5
3 11.2 0.280 0.56 1.9 0.075 17.0 60.0 0.99800 3.16 0.58 9.8 6
4 7.4 0.700 0.00 1.9 0.076 11.0 34.0 0.99780 3.51 0.56 9.4 5
... ... ... ... ... ... ... ... ... ... ... ... ...
1594 6.2 0.600 0.08 2.0 0.090 32.0 44.0 0.99490 3.45 0.58 10.5 5
1595 5.9 0.550 0.10 2.2 0.062 39.0 51.0 0.99512 3.52 0.76 11.2 6
1596 6.3 0.510 0.13 2.3 0.076 29.0 40.0 0.99574 3.42 0.75 11.0 6
1597 5.9 0.645 0.12 2.0 0.075 32.0 44.0 0.99547 3.57 0.71 10.2 5
1598 6.0 0.310 0.47 3.6 0.067 18.0 42.0 0.99549 3.39 0.66 11.0 6

1599 rows × 12 columns

Linear Regression

In [10]:
features = ['fixed acidity','volatile acidity','citric acid','residual sugar','chlorides',
            'free sulfur dioxide','total sulfur dioxide','density','pH','sulphates']
target=['quality']
X = wine_data[features]
y = wine_data[target]
In [11]:
# Perform train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=200)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)
(1071, 10)
(528, 10)
(1071, 1)
(528, 1)
In [12]:
linear_regression = LinearRegression()
linear_regression_fit = linear_regression.fit(X_train, y_train)
linear_regression_prediction = linear_regression.predict(X_test)
# Measuring Performance Metrices
linear_regression_accuracy_train = linear_regression.score(X_train,y_train)
linear_regression_accuracy_test = linear_regression.score(X_test,y_test)
print('Accuracy on Train dataset', linear_regression_accuracy_train)
print('Accuracy on Test dataset', linear_regression_accuracy_test)
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, linear_regression_prediction)))
print('R^2 of Train dataset:', r2_score(y_train, linear_regression.predict(X_train)))
print('R^2 of Test dataset:', r2_score(y_test, linear_regression_prediction))
Accuracy on Train dataset 0.30260968120071396
Accuracy on Test dataset 0.335765567585228
RMSE: 0.6132779026992349
R^2 of Train dataset: 0.30260968120071396
R^2 of Test dataset: 0.335765567585228
In [13]:
#### Regularization using Ridge and Lasso Regression
In [14]:
#### Ridge Regression ####
from sklearn.linear_model import LinearRegression, Ridge, Lasso 
from sklearn.model_selection import train_test_split, cross_val_score 
from statistics import mean 

# List to maintain the different cross-validation scores 
cross_val_scores_ridge = []
# List to maintain the different values of alpha 
alpha = []

# Loop to compute the different values of cross-validation scores 
for i in range(1, 9): 
    ridgeModel = Ridge(alpha = i * 0.25) 
    ridgeModel.fit(X_train, y_train) 
    scores = cross_val_score(ridgeModel, X, y, cv = 10) 
    avg_cross_val_score = mean(scores)*100
    cross_val_scores_ridge.append(avg_cross_val_score) 
    alpha.append(i * 0.25)
    
# Loop to print the different values of cross-validation scores 
for i in range(0, len(alpha)): 
    print(str(alpha[i])+' : '+str(cross_val_scores_ridge[i])) 
0.25 : 11.235992580899326
0.5 : 11.230318772575174
0.75 : 11.205977742635383
1.0 : 11.158662332658567
1.25 : 11.097225103350274
1.5 : 11.028415426169147
1.75 : 10.956596203498254
2.0 : 10.884466279856635
In [76]:
# Performance Metrices Ridge using alpha as 0.25
from sklearn.linear_model import Ridge
# Building and fitting the Ridge Regression model 
ridgeModelChosen = Ridge(alpha = 0.25) 
ridgeModelChosen.fit(X_train, y_train) 
  
# Evaluating the Ridge Regression model 
print('Ridge Regression')
print('Accuracy on Train dataset:', ridgeModelChosen.score(X_train, y_train)) 
print('Accuracy on Test dataset:', ridgeModelChosen.score(X_test, y_test)) 
Ridge Regression
Accuracy on Train dataset: 0.24663411910950783
Accuracy on Test dataset: 0.22576983731122757
In [16]:
#### Lasso Regression ####
# List to maintain the cross-validation scores 
cross_val_scores_lasso = [] 
  
# List to maintain the different values of Lambda 
Lambda = [] 
  
# Loop to compute the cross-validation scores 
for i in range(1, 9): 
    lassoModel = Lasso(alpha = i * 0.25, tol = 0.0925) 
    lassoModel.fit(X_train, y_train) 
    scores = cross_val_score(lassoModel, X, y, cv = 10) 
    avg_cross_val_score = mean(scores)*100
    cross_val_scores_lasso.append(avg_cross_val_score) 
    Lambda.append(i * 0.25) 

# Loop to print the different values of cross-validation scores 
for i in range(0, len(alpha)): 
    print(str(alpha[i])+' : '+str(cross_val_scores_lasso[i]))
0.25 : -8.058490967059718
0.5 : -8.92612938218887
0.75 : -8.978363262534165
1.0 : -9.084881955905205
1.25 : -9.213976897437758
1.5 : -9.365648087131786
1.75 : -9.539895524987351
2.0 : -9.736719211004402
In [77]:
# Building and fitting the Lasso Regression Model 
lassoModelChosen = Lasso(alpha = 0.25) 
lassoModelChosen.fit(X_train, y_train) 
  
# Evaluating the Lasso Regression model 
print('Lasso Regression')
print('Accuracy on Train dataset:', lassoModelChosen.score(X_train, y_train))
print('Accuracy on Test dataset:', lassoModelChosen.score(X_test, y_test))
Lasso Regression
Accuracy on Train dataset: 0.0376413879401899
Accuracy on Test dataset: 0.032196666880541014

Converting the quality to 0 and 1 for further classification models

In [18]:
wine_data['quality_new'] = (wine_data['quality'] >= 6)*1
wine_data1=wine_data.drop(['quality'], axis=1)
sns.catplot(x='quality_new', data = wine_data1, kind='count')
wine_data1.quality_new.value_counts()
Out[18]:
1    855
0    744
Name: quality_new, dtype: int64
In [19]:
features_log = ['fixed acidity','volatile acidity','citric acid','residual sugar','chlorides',
            'free sulfur dioxide','total sulfur dioxide','density','pH','sulphates','alcohol']
target_classifier = ['quality_new']
In [20]:
X1 = wine_data1[features_log]
y1 = wine_data1[target_classifier]
In [21]:
# Perform train and test split
X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y1, test_size=0.33, random_state=324)

Logistic Regression

In [22]:
###### Logistic Regression ######
# Fit on train set
from sklearn.linear_model import LogisticRegression
logistic_regression = LogisticRegression()
logistic_regression.fit(X1_train, y1_train)
logistic_regression_prediction = logistic_regression.predict(X1_test)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/utils/validation.py:760: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
  y = column_or_1d(y, warn=True)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:940: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  extra_warning_msg=_LOGISTIC_SOLVER_CONVERGENCE_MSG)
In [23]:
from sklearn.metrics import confusion_matrix
logistic_regression_accuracy_test = accuracy_score(y1_test,logistic_regression_prediction)
logistic_regression_accuracy_train = accuracy_score(y1_train,logistic_regression.predict(X1_train))
logistic_regression_cnf_mat = confusion_matrix(y1_test, logistic_regression_prediction)
print('Accuracy of Train Dataset:',logistic_regression_accuracy_train)
print('Accuracy of Test Dataset:',logistic_regression_accuracy_test)
print("Precision:",metrics.precision_score(y1_test, logistic_regression_prediction))
print("Recall:",metrics.recall_score(y1_test,logistic_regression_prediction))
print('Confusion matrix:', '\n',logistic_regression_cnf_mat)
Accuracy of Train Dataset: 0.7450980392156863
Accuracy of Test Dataset: 0.740530303030303
Precision: 0.7765151515151515
Recall: 0.7243816254416962
Confusion matrix: 
 [[186  59]
 [ 78 205]]
In [75]:
# Classification Report
from sklearn.metrics import classification_report
logistic_regression_cls_rep = classification_report(y1_test, logistic_regression_prediction)
print('Classification Report of Logistic Model:', '\n',logistic_regression_cls_rep)
Classification Report of Logistic Model: 
               precision    recall  f1-score   support

           0       0.70      0.76      0.73       245
           1       0.78      0.72      0.75       283

    accuracy                           0.74       528
   macro avg       0.74      0.74      0.74       528
weighted avg       0.74      0.74      0.74       528

In [25]:
class_names=[0,1] # name  of classes
fig, ax = plt.subplots()
tick_marks = np.arange(len(class_names))
plt.xticks(tick_marks, class_names)
plt.yticks(tick_marks, class_names)
# create heatmap
sns.heatmap(pd.DataFrame(logistic_regression_cnf_mat), annot=True, cmap="YlGnBu" ,fmt='g')
ax.xaxis.set_label_position("top")
plt.tight_layout()
plt.title('Confusion matrix', y=1.1)
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
#Text(0.5,257.44,'Predicted label')
Out[25]:
Text(0.5, 257.44, 'Predicted label')
In [26]:
# Receiver Operating Curve
y1_pred_proba1 = logistic_regression.predict_proba(X1_test)[::,1]
fpr, tpr, thresholds = metrics.roc_curve(y1_test,  y1_pred_proba1)
auc = metrics.roc_auc_score(y1_test, y1_pred_proba1)
plt.plot(fpr,tpr,label="ROC, auc="+str(auc))
plt.legend(loc=4)
plt.xlabel("FPR")
plt.ylabel("TPR")
plt.title("Receiver Operating Curve for Logistic Model")
plt.show()

Regularization using Ridge and Lasso Regression

In [27]:
# Importing libraries for Ridge, Lasso
from statistics import mean 

# List to maintain the different cross-validation scores 
cross_val_scores_ridge = []
# List to maintain the different values of alpha 
alpha = []

# Loop to compute the different values of cross-validation scores 
for i in range(1, 9): 
    ridgeModel = Ridge(alpha = i * 0.25) 
    ridgeModel.fit(X1_train, y1_train) 
    scores = cross_val_score(ridgeModel, X1, y1, cv = 10) 
    avg_cross_val_score = mean(scores)*100
    cross_val_scores_ridge.append(avg_cross_val_score) 
    alpha.append(i * 0.25)
    
# Loop to print the different values of cross-validation scores 
for i in range(0, len(alpha)): 
    print(str(alpha[i])+' : '+str(cross_val_scores_ridge[i])) 
0.25 : 21.48561880465623
0.5 : 21.50158603393926
0.75 : 21.510823292042975
1.0 : 21.51604236684321
1.25 : 21.518751544242008
1.5 : 21.51982817324151
1.75 : 21.5197978696456
2.0 : 21.518982942280857
In [28]:
# Performance Metrices Ridge using alpha as 1.5
from sklearn.linear_model import Ridge
# Building and fitting the Ridge Regression model 
ridgeModelChosen = Ridge(alpha = 1.5) 
ridgeModelChosen.fit(X1_train, y1_train) 
  
# Evaluating the Ridge Regression model 
print('Accuracy on Train dataset:', ridgeModelChosen.score(X1_train, y1_train)) 
print('Accuracy on Test dataset:', ridgeModelChosen.score(X1_test, y1_test)) 
Accuracy on Train dataset: 0.2847046246710163
Accuracy on Test dataset: 0.3100187101006209
In [29]:
# Lasso Regression
# List to maintain the cross-validation scores 
cross_val_scores_lasso = [] 
  
# List to maintain the different values of Lambda 
Lambda = [] 
  
# Loop to compute the cross-validation scores 
for i in range(1, 9): 
    lassoModel = Lasso(alpha = i * 0.25, tol = 0.0925) 
    lassoModel.fit(X1_train, y1_train) 
    scores = cross_val_score(lassoModel, X1, y1, cv = 10) 
    avg_cross_val_score = mean(scores)*100
    cross_val_scores_lasso.append(avg_cross_val_score) 
    Lambda.append(i * 0.25) 

# Loop to print the different values of cross-validation scores 
for i in range(0, len(alpha)): 
    print(str(alpha[i])+' : '+str(cross_val_scores_lasso[i]))
0.25 : -3.42900586289926
0.5 : -4.282896147401248
0.75 : -4.450876235626486
1.0 : -4.671732078828174
1.25 : -4.945463677006306
1.5 : -5.272071030160878
1.75 : -5.651554138291901
2.0 : -6.08391300139937
In [30]:
# Building and fitting the Lasso Regression Model 
lassoModelChosen = Lasso(alpha = 0.25) 
lassoModelChosen.fit(X1_train, y1_train) 
  
# Evaluating the Lasso Regression model 
print('Accuracy on Train dataset:', lassoModelChosen.score(X1_train, y1_train)) 
print('Accuracy on Test dataset:', lassoModelChosen.score(X1_test, y1_test)) 
Accuracy on Train dataset: 0.05633253950304662
Accuracy on Test dataset: 0.07830756627270707

Decision Tree

In [31]:
###### Decision Tree #######
import time
from datetime import date
import warnings
warnings.filterwarnings('ignore')
start_time = time.time()
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
decision_tree = DecisionTreeClassifier(max_leaf_nodes=20, random_state=0)
decision_tree.fit(X1_train,y1_train)
today = date.today()
print("Run Time: %s seconds" % (time.time() - start_time))
decision_tree_predict = decision_tree.predict(X1_test)

decision_tree_acc_score_train = accuracy_score(y1_train, decision_tree.predict(X1_train))
decision_tree_acc_score_test = accuracy_score(y1_test, decision_tree_predict)

decision_tree_recall = metrics.recall_score(y1_test, decision_tree_predict)

# Calculate Confusion Matrix
from sklearn.metrics import f1_score, confusion_matrix, accuracy_score, recall_score, precision_score
decision_tree_conf_matrix = confusion_matrix(y1_test,decision_tree_predict)
print('confusion matrix: ','\n',decision_tree_conf_matrix)


from sklearn import metrics
print("Recall:",metrics.recall_score(y1_test, decision_tree_predict))
print("Accuracy on Training set:", decision_tree_acc_score_train)
print("Accuracy on Test set:", decision_tree_acc_score_test)
Run Time: 0.06136608123779297 seconds
confusion matrix:  
 [[179  66]
 [ 78 205]]
Recall: 0.7243816254416962
Accuracy on Training set: 0.7992530345471522
Accuracy on Test set: 0.7272727272727273
In [32]:
from sklearn.tree import export_graphviz
from sklearn.externals.six import StringIO  
from IPython.display import Image  
import pydotplus
In [33]:
dt_feature_names = list(X1.columns)
dot_data = StringIO()
export_graphviz(decision_tree, out_file=dot_data,  
                filled=True, rounded=False,
                special_characters=True, feature_names=dt_feature_names)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())  
Image(graph.create_png())
Out[33]:
In [78]:
class_names=[0,1] # name  of classes
fig, ax = plt.subplots()
tick_marks = np.arange(len(class_names))
plt.xticks(tick_marks, class_names)
plt.yticks(tick_marks, class_names)
# create heatmap
sns.heatmap(pd.DataFrame(decision_tree_conf_matrix), annot=True, cmap="YlGnBu" ,fmt='g')
ax.xaxis.set_label_position("top")
plt.tight_layout()
plt.title('Confusion matrix Decision Tree', y=1.1)
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
#Text(0.5,257.44,'Predicted label')
Out[78]:
Text(0.5, 257.44, 'Predicted label')
In [34]:
# Receiver Operating Curve
y2_pred_proba2 = decision_tree.predict_proba(X1_test)[::,1]
fpr, tpr, thresholds = metrics.roc_curve(y1_test,  y2_pred_proba2)
auc = metrics.roc_auc_score(y1_test, y2_pred_proba2)
plt.plot(fpr,tpr,label="ROC, auc="+str(auc))
plt.legend(loc=4)
plt.xlabel("FPR")
plt.ylabel("TPR")
plt.title("Receiver Operating Curve for Decision Tree Model")
plt.show()
In [73]:
# Classification Report
from sklearn.metrics import classification_report
decision_tree_cls_rep = classification_report(y1_test, decision_tree_predict)
print('Classification Report of Decision Tree:', '\n',decision_tree_cls_rep)
Classification Report of Decision Tree: 
               precision    recall  f1-score   support

           0       0.70      0.73      0.71       245
           1       0.76      0.72      0.74       283

    accuracy                           0.73       528
   macro avg       0.73      0.73      0.73       528
weighted avg       0.73      0.73      0.73       528

In [38]:
### Optimization using GridSearch

from sklearn import decomposition, datasets
from sklearn import tree
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, cross_val_score, cross_val_predict
from sklearn.preprocessing import StandardScaler
In [39]:
tuned_parameters = {'criterion' : ['gini', 'entropy'],'max_depth':[4,6,8,12],'random_state':[14]} 
# random_state ensures repeatable results

dt_clf = GridSearchCV(DecisionTreeClassifier(), tuned_parameters, cv=5, scoring='roc_auc')
dt_clf.fit(X1_train, y1_train)
Out[39]:
GridSearchCV(cv=5, error_score=nan,
             estimator=DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort='deprecated',
                                              random_state=None,
                                              splitter='best'),
             iid='deprecated', n_jobs=None,
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [4, 6, 8, 12], 'random_state': [14]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='roc_auc', verbose=0)
In [40]:
print('The best model is: ', dt_clf.best_params_)
print('This model produces a mean cross-validated score (auc) of', dt_clf.best_score_)
The best model is:  {'criterion': 'gini', 'max_depth': 4, 'random_state': 14}
This model produces a mean cross-validated score (auc) of 0.7754927859834038
In [81]:
from sklearn.metrics import precision_score, accuracy_score
print("GridSearch Optimization")
y1_true, y1_pred2 = y1_test, dt_clf.predict(X1_test)
decision_tree_opt3_accuracy_test = accuracy_score(y1_true, y1_pred2)
decision_tree_opt3_accuracy_train = accuracy_score(y1_train , dt_clf.predict(X1_train))
decision_tree_opt3_recall = metrics.recall_score(y1_true, y1_pred2)
decision_tree_opt3_conf_matrix = confusion_matrix(y1_true, y1_pred2)
print('Precision on the evaluation set: ', precision_score(y1_true, y1_pred2))
print('Accuracy on the evaluation set: ', accuracy_score(y1_true, y1_pred2))
print("Recall:",metrics.recall_score(y1_true, y1_pred2))
print('Confusion Matrix: ','\n',confusion_matrix(y1_true, y1_pred2))
GridSearch Optimization
Precision on the evaluation set:  0.6829268292682927
Accuracy on the evaluation set:  0.6912878787878788
Recall: 0.7915194346289752
Confusion Matrix:  
 [[141 104]
 [ 59 224]]
In [82]:
# Classification Report
from sklearn.metrics import classification_report
decision_tree_opt3_cls_rep = classification_report(y1_test, dt_clf.predict(X1_test))
print('Classification Report of Decision Tree (GridSearch):', '\n',decision_tree_opt3_cls_rep)
Classification Report of Decision Tree (GridSearch): 
               precision    recall  f1-score   support

           0       0.70      0.58      0.63       245
           1       0.68      0.79      0.73       283

    accuracy                           0.69       528
   macro avg       0.69      0.68      0.68       528
weighted avg       0.69      0.69      0.69       528

In [83]:
class_names=[0,1] # name  of classes
fig, ax = plt.subplots()
tick_marks = np.arange(len(class_names))
plt.xticks(tick_marks, class_names)
plt.yticks(tick_marks, class_names)
# create heatmap
sns.heatmap(pd.DataFrame(decision_tree_opt3_conf_matrix), annot=True, cmap="YlGnBu" ,fmt='g')
ax.xaxis.set_label_position("top")
plt.tight_layout()
plt.title('Confusion matrix Decision Tree GridSearch', y=1.1)
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
#Text(0.5,257.44,'Predicted label')
Out[83]:
Text(0.5, 257.44, 'Predicted label')
In [84]:
# Receiver Operating Curve
y5_pred_proba5 = decision_tree.predict_proba(X1_test)[::,1]
fpr, tpr, thresholds = metrics.roc_curve(y1_test,  y5_pred_proba5)
auc = metrics.roc_auc_score(y1_test, y5_pred_proba5)
plt.plot(fpr,tpr,label="ROC, auc="+str(auc))
plt.legend(loc=4)
plt.xlabel("FPR")
plt.ylabel("TPR")
plt.title("Receiver Operating Curve for Decision Tree Model GridSearch")
plt.show()

Random Forest

In [43]:
###### Random Forest  #########
start_time = time.time()
from sklearn.ensemble import RandomForestClassifier 
random_forest = RandomForestClassifier(max_depth = 2, random_state = 0)
random_forest.fit(X1_train,y1_train)
today = date.today()
print("Run Time: %s seconds" % (time.time() - start_time))
random_forest_predict = random_forest.predict(X1_test)

random_forest_acc_score_train = accuracy_score(y1_train, random_forest.predict(X1_train))
random_forest_acc_score_test = accuracy_score(y1_test, random_forest_predict)
random_forest_acc_score_recall = metrics.recall_score(y1_test, random_forest_predict)


# Calculate Confusion Matrix
random_forest_conf_matrix = confusion_matrix(y1_test,random_forest_predict)
print('confusion matrix: ','\n',random_forest_conf_matrix)


from sklearn import metrics
print("Recall:",random_forest_acc_score_recall)
print("Accuracy on Training set:", random_forest_acc_score_train)
print("Accuracy on Test set:", random_forest_acc_score_test)
Run Time: 0.1771681308746338 seconds
confusion matrix:  
 [[183  62]
 [ 67 216]]
Recall: 0.7632508833922261
Accuracy on Training set: 0.7478991596638656
Accuracy on Test set: 0.7556818181818182
In [85]:
class_names=[0,1] # name  of classes
fig, ax = plt.subplots()
tick_marks = np.arange(len(class_names))
plt.xticks(tick_marks, class_names)
plt.yticks(tick_marks, class_names)
# create heatmap
sns.heatmap(pd.DataFrame(random_forest_conf_matrix), annot=True, cmap="YlGnBu" ,fmt='g')
ax.xaxis.set_label_position("top")
plt.tight_layout()
plt.title('Confusion matrix Random Forest ', y=1.1)
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
#Text(0.5,257.44,'Predicted label')
Out[85]:
Text(0.5, 257.44, 'Predicted label')
In [44]:
# Receiver Operating Curve
y3_pred_proba3 = random_forest.predict_proba(X1_test)[::,1]
fpr, tpr, thresholds = metrics.roc_curve(y1_test,  y3_pred_proba3)
auc = metrics.roc_auc_score(y1_test, y3_pred_proba3)
plt.plot(fpr,tpr,label="ROC, auc="+str(auc))
plt.legend(loc=4)
plt.xlabel("FPR")
plt.ylabel("TPR")
plt.title("Receiver Operating Curve for Random Forest Model")
plt.show()
In [45]:
# Classification Report
from sklearn.metrics import classification_report
random_forest_cls_rep = classification_report(y1_test, random_forest_predict)
print('Classification Report of Random Forest:', '\n',random_forest_cls_rep)
Classification Report of Random Forest: 
               precision    recall  f1-score   support

           0       0.73      0.75      0.74       245
           1       0.78      0.76      0.77       283

    accuracy                           0.76       528
   macro avg       0.75      0.76      0.75       528
weighted avg       0.76      0.76      0.76       528

In [47]:
# Optimization using GridSearch
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
tuned_parameters = {'max_features': [0.5,0.6,0.7,0.8,0.9,1.0], 
                    'max_depth': [2,3,4,5,6,7],'min_samples_leaf':[1,10,100],'random_state':[14]} 
# random_state ensures repeatable results

rf_clf = GridSearchCV(RandomForestClassifier(), tuned_parameters, cv=5, scoring='roc_auc')
rf_clf.fit(X1_train, y1_train)
Out[47]:
GridSearchCV(cv=5, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                              oob_score=False,
                                              random_state=None, verbose=0,
                                              warm_start=False),
             iid='deprecated', n_jobs=None,
             param_grid={'max_depth': [2, 3, 4, 5, 6, 7],
                         'max_features': [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
                         'min_samples_leaf': [1, 10, 100],
                         'random_state': [14]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='roc_auc', verbose=0)
In [49]:
print('The best model is: ', rf_clf.best_params_)
print('This model produces a mean cross-validated score (auc) of', rf_clf.best_score_)
The best model is:  {'max_depth': 7, 'max_features': 0.8, 'min_samples_leaf': 1, 'random_state': 14}
This model produces a mean cross-validated score (auc) of 0.8446050297020549
In [50]:
from sklearn.metrics import precision_score, accuracy_score
print("GridSearch Optimization")
y3_true1, y3_pred4 = y1_test, rf_clf.predict(X1_test)

random_forest_opt1_accuracy_test = accuracy_score(y3_true1, y3_pred4)
random_forest_opt1_accuracy_train = accuracy_score(y1_train, rf_clf.predict(X1_train))
random_forest_opt1_recall = metrics.recall_score(y3_true1, y3_pred4)


print('Precision on the evaluation set: ', precision_score(y3_true1, y3_pred4))
print('Accuracy on the evaluation set: ', accuracy_score(y3_true1, y3_pred4))
print("Recall:",random_forest_opt1_recall)
print('Confusion Matrix: ','\n',confusion_matrix(y3_true1, y3_pred4))
GridSearch Optimization
Precision on the evaluation set:  0.8021201413427562
Accuracy on the evaluation set:  0.7878787878787878
Recall: 0.8021201413427562
Confusion Matrix:  
 [[189  56]
 [ 56 227]]
In [51]:
# Classification Report
from sklearn.metrics import classification_report
random_forest_opt1_cls_rep = classification_report(y1_test, rf_clf.predict(X1_test))
print('Classification Report of Random Forest(GridSearch):', '\n',random_forest_opt1_cls_rep)
Classification Report of Random Forest(GridSearch): 
               precision    recall  f1-score   support

           0       0.77      0.77      0.77       245
           1       0.80      0.80      0.80       283

    accuracy                           0.79       528
   macro avg       0.79      0.79      0.79       528
weighted avg       0.79      0.79      0.79       528

In [86]:
class_names=[0,1] # name  of classes
fig, ax = plt.subplots()
tick_marks = np.arange(len(class_names))
plt.xticks(tick_marks, class_names)
plt.yticks(tick_marks, class_names)
# create heatmap
sns.heatmap(pd.DataFrame(confusion_matrix(y3_true1, y3_pred4)), annot=True, cmap="YlGnBu" ,fmt='g')
ax.xaxis.set_label_position("top")
plt.tight_layout()
plt.title('Confusion matrix Random Forest GridSearch ', y=1.1)
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
#Text(0.5,257.44,'Predicted label')
Out[86]:
Text(0.5, 257.44, 'Predicted label')
In [87]:
# Receiver Operating Curve
y6_pred_proba6 = random_forest.predict_proba(X1_test)[::,1]
fpr, tpr, thresholds = metrics.roc_curve(y1_test,  y6_pred_proba6)
auc = metrics.roc_auc_score(y1_test, y6_pred_proba6)
plt.plot(fpr,tpr,label="ROC, auc="+str(auc))
plt.legend(loc=4)
plt.xlabel("FPR")
plt.ylabel("TPR")
plt.title("Receiver Operating Curve for Random Forest using GridSearch")
plt.show()
In [52]:
# Optimization using Random Search
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint
from sklearn import ensemble

param_dist={'n_estimators':[100,200,300,400,500,600],'criterion':['gini','entropy'],'max_depth':randint(1,15),'max_features':randint(1,9),'min_samples_leaf':randint(1,9)}
rf1_clf=ensemble.RandomForestClassifier()
rf1_clf_cv=RandomizedSearchCV(rf1_clf,param_distributions=param_dist,cv=5)
rf1_clf_cv.fit(X1_train,y1_train)
print("Tuned Random Forest Parameters: {}".format(rf1_clf_cv.best_params_)) 
print("Best score is {}".format(rf1_clf_cv.best_score_)) 
Tuned Random Forest Parameters: {'criterion': 'entropy', 'max_depth': 11, 'max_features': 7, 'min_samples_leaf': 4, 'n_estimators': 600}
Best score is 0.7666116061725712
In [53]:
# Putting these hyperparameters into our model
rf1_clf=ensemble.RandomForestClassifier(criterion='entropy',max_depth=10,max_features=6,min_samples_leaf=1,n_estimators=400)
rf1_clf.fit(X1_train,y1_train)
Out[53]:
RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='entropy', max_depth=10, max_features=6,
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=400,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)
In [54]:
print("RandomSearch optimization")
today = date.today()
print("Run Time: %s seconds" % (time.time() - start_time))
rf1_clf1 = rf1_clf.predict(X1_test)
random_forest_opt2_accuracy = accuracy_score(y1_test, rf1_clf1)

rf1_clf1_conf_matrix = confusion_matrix(y1_test,rf1_clf1)
print('confusion matrix: ','\n',rf1_clf1_conf_matrix)

print("Recall:",metrics.recall_score(y1_test,rf1_clf1))
print("Accuracy on Training set:", accuracy_score(y1_train, rf1_clf.predict(X1_train)))
print("Accuracy on Test set:", random_forest_opt2_accuracy)
RandomSearch optimization
Run Time: 230.66269326210022 seconds
confusion matrix:  
 [[195  50]
 [ 47 236]]
Recall: 0.833922261484099
Accuracy on Training set: 0.9906629318394025
Accuracy on Test set: 0.8162878787878788
In [71]:
# Classification Report
from sklearn.metrics import classification_report
random_forest_opt2_cls_rep = classification_report(y1_test, rf1_clf.predict(X1_test))
print('Classification Report of Random Forest(Random Search):', '\n',random_forest_opt2_cls_rep)
Classification Report of Random Forest(Random Search): 
               precision    recall  f1-score   support

           0       0.81      0.80      0.80       245
           1       0.83      0.83      0.83       283

    accuracy                           0.82       528
   macro avg       0.82      0.81      0.82       528
weighted avg       0.82      0.82      0.82       528

In [88]:
class_names=[0,1] # name  of classes
fig, ax = plt.subplots()
tick_marks = np.arange(len(class_names))
plt.xticks(tick_marks, class_names)
plt.yticks(tick_marks, class_names)
# create heatmap
sns.heatmap(pd.DataFrame(rf1_clf1_conf_matrix), annot=True, cmap="YlGnBu" ,fmt='g')
ax.xaxis.set_label_position("top")
plt.tight_layout()
plt.title('Confusion matrix Random Forest Random Search ', y=1.1)
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
#Text(0.5,257.44,'Predicted label')
Out[88]:
Text(0.5, 257.44, 'Predicted label')
In [56]:
from sklearn.tree import export_graphviz
from sklearn.externals.six import StringIO  
from IPython.display import Image  
import pydotplus
In [57]:
estimator = rf1_clf.estimators_[5]
dt_feature_names = list(X1.columns)
export_graphviz(estimator, 
                out_file='tree.dot', 
                feature_names = dt_feature_names,
                rounded = True, proportion = False, 
                precision = 2, filled = True)
from subprocess import call
call(['dot', '-Tpng', 'tree.dot', '-o', 'tree.png', '-Gdpi=600'])
# Display in jupyter notebook
from IPython.display import Image
Image(filename = 'tree.png')
Out[57]:
In [58]:
# Receiver Operating Curve
y4_pred_proba4 = rf1_clf.predict_proba(X1_test)[::,1]
fpr, tpr, thresholds = metrics.roc_curve(y1_test,  y4_pred_proba4)
auc = metrics.roc_auc_score(y1_test, y4_pred_proba4)
plt.plot(fpr,tpr,label="ROC, auc="+str(auc))
plt.legend(loc=4)
plt.xlabel("FPR")
plt.ylabel("TPR")
plt.title("Receiver Operating Curve for Optimized Random Forest Model")
plt.show()

Gradient Boosting Model

In [59]:
# Gradient Boost Model
from sklearn import metrics
start_time = time.time()
from sklearn.ensemble import GradientBoostingClassifier
gradient_boost = GradientBoostingClassifier()
gradient_boost.fit(X1_train,y1_train)
today = date.today()
print("Run Time: %s seconds" % (time.time() - start_time))
gradient_boost_predict = gradient_boost.predict(X1_test)

gradient_boost_acc_score_train = accuracy_score(y1_train, gradient_boost.predict(X1_train))
gradient_boost_acc_score_test = accuracy_score(y1_test, gradient_boost_predict)

gradient_boost_recall = metrics.recall_score(y1_test,gradient_boost_predict)

# Calculate Confusion Matrix
from sklearn.metrics import f1_score, confusion_matrix, accuracy_score, recall_score, precision_score
gradient_boost_conf_matrix = confusion_matrix(y1_test,gradient_boost_predict)
print('confusion matrix: ','\n',gradient_boost_conf_matrix)



print("Recall:",gradient_boost_recall)
print("Accuracy on Training set:",gradient_boost_acc_score_train)
print("Accuracy on Test set:", gradient_boost_acc_score_test)
Run Time: 0.20320677757263184 seconds
confusion matrix:  
 [[187  58]
 [ 56 227]]
Recall: 0.8021201413427562
Accuracy on Training set: 0.8786181139122315
Accuracy on Test set: 0.7840909090909091
In [89]:
class_names=[0,1] # name  of classes
fig, ax = plt.subplots()
tick_marks = np.arange(len(class_names))
plt.xticks(tick_marks, class_names)
plt.yticks(tick_marks, class_names)
# create heatmap
sns.heatmap(pd.DataFrame(gradient_boost_conf_matrix), annot=True, cmap="YlGnBu" ,fmt='g')
ax.xaxis.set_label_position("top")
plt.tight_layout()
plt.title('Confusion matrix Gradient Boost ', y=1.1)
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
#Text(0.5,257.44,'Predicted label')
Out[89]:
Text(0.5, 257.44, 'Predicted label')
In [60]:
# Receiver Operating Curve
y5_pred_proba5 = gradient_boost.predict_proba(X1_test)[::,1]
fpr, tpr, thresholds = metrics.roc_curve(y1_test,  y5_pred_proba5)
auc = metrics.roc_auc_score(y1_test, y5_pred_proba5)
plt.plot(fpr,tpr,label="ROC, auc="+str(auc))
plt.legend(loc=4)
plt.xlabel("FPR")
plt.ylabel("TPR")
plt.title("Receiver Operating Curve for Gradient Boost Model")
plt.show()
In [70]:
# Classification Report
from sklearn.metrics import classification_report
gradient_boost_cls_rep = classification_report(y1_test,gradient_boost_predict)
print('Classification Report of Gradient Boosting:', '\n',gradient_boost_cls_rep)
Classification Report of Gradient Boosting: 
               precision    recall  f1-score   support

           0       0.77      0.76      0.77       245
           1       0.80      0.80      0.80       283

    accuracy                           0.78       528
   macro avg       0.78      0.78      0.78       528
weighted avg       0.78      0.78      0.78       528

Optimization using GridSearch

In [63]:
# Optimization using GridSearch
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
In [64]:
#GridSearch Gradient boost model
gb_clf=GridSearchCV(estimator=GradientBoostingClassifier(),cv=10,param_grid=dict({'n_estimators':[500]}))
gb_clf.fit(X1_train,y1_train)
Out[64]:
GridSearchCV(cv=10, error_score=nan,
             estimator=GradientBoostingClassifier(ccp_alpha=0.0,
                                                  criterion='friedman_mse',
                                                  init=None, learning_rate=0.1,
                                                  loss='deviance', max_depth=3,
                                                  max_features=None,
                                                  max_leaf_nodes=None,
                                                  min_impurity_decrease=0.0,
                                                  min_impurity_split=None,
                                                  min_samples_leaf=1,
                                                  min_samples_split=2,
                                                  min_weight_fraction_leaf=0.0,
                                                  n_estimators=100,
                                                  n_iter_no_change=None,
                                                  presort='deprecated',
                                                  random_state=None,
                                                  subsample=1.0, tol=0.0001,
                                                  validation_fraction=0.1,
                                                  verbose=0, warm_start=False),
             iid='deprecated', n_jobs=None, param_grid={'n_estimators': [500]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)
In [66]:
print('The best model is: ', gb_clf.best_params_)
print('This model produces a mean cross-validated score (auc) of', gb_clf.best_score_)
The best model is:  {'n_estimators': 500}
This model produces a mean cross-validated score (auc) of 0.7693925233644859
In [67]:
from sklearn.metrics import precision_score, accuracy_score
print("GridSearch Optimization")
y4_true2, y4_pred7 = y1_test, gb_clf.predict(X1_test)
gradient_boost_opt1_accuracy = accuracy_score(y4_true2, y4_pred7)

gradient_boost_opt1_accuracy_test = accuracy_score(y4_true2, y4_pred7)
gradient_boost_opt1_accuracy_train = accuracy_score(y1_train, gb_clf.predict(X1_train))
gradient_boost_opt1_recall = metrics.recall_score(y4_true2, y4_pred7)

print('Precision on the evaluation set: ', precision_score(y4_true2, y4_pred7))
print('Accuracy on the evaluation set: ', accuracy_score(y4_true2, y4_pred7))
print("Recall:",metrics.recall_score(y4_true2, y4_pred7))
print('Confusion Matrix: ','\n',confusion_matrix(y4_true2, y4_pred7))
GridSearch Optimization
Precision on the evaluation set:  0.8127208480565371
Accuracy on the evaluation set:  0.7992424242424242
Recall: 0.8127208480565371
Confusion Matrix:  
 [[192  53]
 [ 53 230]]
In [68]:
# Classification Report
from sklearn.metrics import classification_report
gradient_boost_opt1_cls_rep = classification_report(y1_test,gb_clf.predict(X1_test))
print('Classification Report of Gradient Boosting(GridSearch):', '\n',gradient_boost_opt1_cls_rep)
Classification Report of Gradient Boosting(GridSearch): 
               precision    recall  f1-score   support

           0       0.78      0.78      0.78       245
           1       0.81      0.81      0.81       283

    accuracy                           0.80       528
   macro avg       0.80      0.80      0.80       528
weighted avg       0.80      0.80      0.80       528

In [90]:
class_names=[0,1] # name  of classes
fig, ax = plt.subplots()
tick_marks = np.arange(len(class_names))
plt.xticks(tick_marks, class_names)
plt.yticks(tick_marks, class_names)
# create heatmap
sns.heatmap(pd.DataFrame(confusion_matrix(y4_true2, y4_pred7)), annot=True, cmap="YlGnBu" ,fmt='g')
ax.xaxis.set_label_position("top")
plt.tight_layout()
plt.title('Confusion matrix Gradient Boost GridSearch ', y=1.1)
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
#Text(0.5,257.44,'Predicted label')
Out[90]:
Text(0.5, 257.44, 'Predicted label')
In [91]:
# Receiver Operating Curve
y9_pred_proba9 = gradient_boost.predict_proba(X1_test)[::,1]
fpr, tpr, thresholds = metrics.roc_curve(y1_test,  y9_pred_proba9)
auc = metrics.roc_auc_score(y1_test, y9_pred_proba9)
plt.plot(fpr,tpr,label="ROC, auc="+str(auc))
plt.legend(loc=4)
plt.xlabel("FPR")
plt.ylabel("TPR")
plt.title("Receiver Operating Curve for Gradient Boost Model Grid Search")
plt.show()

Comparison of Different Classification models

In [69]:
data = {'Model':  ['Decision Tree' , 'Random Forest' , 'Gradient Boost'],
        'Accuracy Train' : [decision_tree_acc_score_train, random_forest_acc_score_train , gradient_boost_acc_score_train],
        'Accuracy Test': [decision_tree_acc_score_test , random_forest_acc_score_test , gradient_boost_acc_score_test],
        'Recall': [decision_tree_recall , random_forest_acc_score_recall ,gradient_boost_recall],
         'GridSearch Accuracy Train' : [decision_tree_opt3_accuracy_train, random_forest_opt1_accuracy_train ,gradient_boost_opt1_accuracy_train],
        'GridSearch Accuracy Test' : [decision_tree_opt3_accuracy_test , random_forest_opt1_accuracy_test , gradient_boost_opt1_accuracy_test],
        'GridSearch Recall' : [decision_tree_opt3_recall , random_forest_opt1_recall , gradient_boost_opt1_recall]
        }

df = pd.DataFrame (data, columns = ['Model','Accuracy Train','Accuracy Test','Recall','GridSearch Accuracy Test', 'GridSearch Accuracy Train' , 'GridSearch Recall'])
df.set_index('Model', inplace=True)
df
Out[69]:
Accuracy Train Accuracy Test Recall GridSearch Accuracy Test GridSearch Accuracy Train GridSearch Recall
Model
Decision Tree 0.799253 0.727273 0.724382 0.691288 0.774043 0.791519
Random Forest 0.747899 0.755682 0.763251 0.787879 0.910364 0.802120
Gradient Boost 0.878618 0.784091 0.802120 0.799242 0.996265 0.812721
In [ ]: